Importing libraries

#load packages
library(ggplot2)
library(gridExtra)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(reshape2)

Loading data

# Read the breach records from the CSV file and preview the first six rows
df <- read.csv(file = "data/data_breach.csv")
head(df, n = 6L)
##   Date.Made.Public                        Company     State Type.of.breach
## 1       2009-10-21  Bullitt County Public Schools  Kentucky           DISC
## 2       2009-10-21 Roane State Community College  Tennessee           PORT
## 3       2009-10-15                 Halifax Health   Florida           PORT
## 4       2009-10-04      Suffolk Community College  New York           DISC
## 5       2009-09-28               Penrose Hospital  Colorado           PHYS
## 6       2009-09-23    Eastern Kentucky University  Kentucky           DISC
##   Type.of.organization Total.Records
## 1                  EDU           676
## 2                  EDU         14783
## 3                  MED         33000
## 4                  EDU           300
## 5                  MED           175
## 6                  EDU          5045
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Description.of.incident
## 1                                                                                                                                                                                                                                                                                               A Bullitt County Public Schools \n            employee accidentally sent an e-mail message to about 1,800 school \n            district workers that included the names and Social Security numbers \n            of 676 district employees. The employees were identified as not having \n            completed the district's 2010 open-enrollment process for insurance, \n            and the e-mail was intended as a reminder to complete the process.
## 2 Roane State Community College \n            has announced that the names and Social Security numbers of 9,747 \n            current or former students were on a data storage device stolen from \n            an employee's vehicle, along with 1,194 current/former employees' \n            information. The Social Security numbers alone, with no names, were \n            also stolen for 5,036 additional current or former students. The data \n            was on a 4GB USB drive used for work-related purposes. An employee \n            took it home to do work after hours, and left it in the car. The employee \n            forgot to lock the car doors. The USB drive was stolen along with \n            a personal hand-held device.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   A laptop computer from a Halifax \n            Health employee's vehicle in Orange County was stolen -- which might \n            have contained password protected patient information.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                               Suffolk Community College has \n            agreed to pay a company for the next year to monitor the credit of \n            300 students whose last names and Social Security numbers were mistakenly \n            listed in an attachment to an e-mail sent to those students last month.
## 5                                                                                                                                                                                                                                                                                                                     Officials at Penrose Hospital              believe someone has stolen the personal information of 175 patients.              The missing information consists of names, addresses, phone numbers,              Social Security numbers and the reason for the patients' visits. The              information was stored on a computer print-out and kept in a binder              stored in a cabinet. The print out has gone missing.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         The names and Social Security \n            numbers of about 5,000 Eastern Kentucky University faculty, staff \n            and student workers were posted inadvertently on the Internet last \n            September, where they have been displayed for a year.
##   Information.Source Source.URL Year.of.Breach Latitude  Longitude
## 1        Dataloss DB                      2009 37.98840  -85.71579
## 2        Dataloss DB                      2009 35.93396  -84.55244
## 3        Dataloss DB                      2009 29.21082  -81.02283
## 4        Dataloss DB                      2009 40.86649  -73.03566
## 5        Dataloss DB                      2009 38.83388 -104.82136
## 6        Dataloss DB                      2009 37.74786  -84.29465
##   breach_type_gA breach_type_gB breach_type_gC breach_type_gD  region_A
## 1         Others        Insider         Others        Network     South
## 2         Others          Mixed        Offline          Local     South
## 3         Others          Mixed        Offline          Local     South
## 4         Others        Insider         Others        Network Northeast
## 5         Others          Mixed        Offline          Local      West
## 6         Others        Insider         Others        Network     South
##    region_B age_of_Law   org_group records_group records_index
## 1 Southeast          0 F_regulated        Others  0.0008129202
## 2 Southeast         14 F_regulated        Others  0.0177772189
## 3 Southeast         14 F_regulated        Others  0.0396839765
## 4 Northeast         14 F_regulated        Others  0.0003607634
## 5      West         13 F_regulated        Others  0.0002104453
## 6 Southeast          0 F_regulated        Others  0.0060668382
##   records_scale type_scale org_scale severity_index1 severity_index2
## 1            10          1         8              11              18
## 2            10          7         8              17              18
## 3            10          7        10              17              20
## 4            10          1         8              11              18
## 5            10          7        10              17              20
## 6            10          1         8              11              18
##   severity_index3 severity_index4 severity_index5
## 1              19        6.333333       22.333333
## 2              25        8.333333        2.333333
## 3              27        9.000000        3.000000
## 4              19        6.333333       22.333333
## 5              27        9.000000        3.000000
## 6              19        6.333333       22.333333
# Keep only the breaches inside the US: rows whose region_A equals 'Non_US'
# are excluded. which() also skips rows where the comparison is NA.
df <- df[which(df$region_A != "Non_US"), , drop = FALSE]
head(df, n = 6L)
##   Date.Made.Public                        Company     State Type.of.breach
## 1       2009-10-21  Bullitt County Public Schools  Kentucky           DISC
## 2       2009-10-21 Roane State Community College  Tennessee           PORT
## 3       2009-10-15                 Halifax Health   Florida           PORT
## 4       2009-10-04      Suffolk Community College  New York           DISC
## 5       2009-09-28               Penrose Hospital  Colorado           PHYS
## 6       2009-09-23    Eastern Kentucky University  Kentucky           DISC
##   Type.of.organization Total.Records
## 1                  EDU           676
## 2                  EDU         14783
## 3                  MED         33000
## 4                  EDU           300
## 5                  MED           175
## 6                  EDU          5045
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Description.of.incident
## 1                                                                                                                                                                                                                                                                                               A Bullitt County Public Schools \n            employee accidentally sent an e-mail message to about 1,800 school \n            district workers that included the names and Social Security numbers \n            of 676 district employees. The employees were identified as not having \n            completed the district's 2010 open-enrollment process for insurance, \n            and the e-mail was intended as a reminder to complete the process.
## 2 Roane State Community College \n            has announced that the names and Social Security numbers of 9,747 \n            current or former students were on a data storage device stolen from \n            an employee's vehicle, along with 1,194 current/former employees' \n            information. The Social Security numbers alone, with no names, were \n            also stolen for 5,036 additional current or former students. The data \n            was on a 4GB USB drive used for work-related purposes. An employee \n            took it home to do work after hours, and left it in the car. The employee \n            forgot to lock the car doors. The USB drive was stolen along with \n            a personal hand-held device.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   A laptop computer from a Halifax \n            Health employee's vehicle in Orange County was stolen -- which might \n            have contained password protected patient information.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                               Suffolk Community College has \n            agreed to pay a company for the next year to monitor the credit of \n            300 students whose last names and Social Security numbers were mistakenly \n            listed in an attachment to an e-mail sent to those students last month.
## 5                                                                                                                                                                                                                                                                                                                     Officials at Penrose Hospital              believe someone has stolen the personal information of 175 patients.              The missing information consists of names, addresses, phone numbers,              Social Security numbers and the reason for the patients' visits. The              information was stored on a computer print-out and kept in a binder              stored in a cabinet. The print out has gone missing.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         The names and Social Security \n            numbers of about 5,000 Eastern Kentucky University faculty, staff \n            and student workers were posted inadvertently on the Internet last \n            September, where they have been displayed for a year.
##   Information.Source Source.URL Year.of.Breach Latitude  Longitude
## 1        Dataloss DB                      2009 37.98840  -85.71579
## 2        Dataloss DB                      2009 35.93396  -84.55244
## 3        Dataloss DB                      2009 29.21082  -81.02283
## 4        Dataloss DB                      2009 40.86649  -73.03566
## 5        Dataloss DB                      2009 38.83388 -104.82136
## 6        Dataloss DB                      2009 37.74786  -84.29465
##   breach_type_gA breach_type_gB breach_type_gC breach_type_gD  region_A
## 1         Others        Insider         Others        Network     South
## 2         Others          Mixed        Offline          Local     South
## 3         Others          Mixed        Offline          Local     South
## 4         Others        Insider         Others        Network Northeast
## 5         Others          Mixed        Offline          Local      West
## 6         Others        Insider         Others        Network     South
##    region_B age_of_Law   org_group records_group records_index
## 1 Southeast          0 F_regulated        Others  0.0008129202
## 2 Southeast         14 F_regulated        Others  0.0177772189
## 3 Southeast         14 F_regulated        Others  0.0396839765
## 4 Northeast         14 F_regulated        Others  0.0003607634
## 5      West         13 F_regulated        Others  0.0002104453
## 6 Southeast          0 F_regulated        Others  0.0060668382
##   records_scale type_scale org_scale severity_index1 severity_index2
## 1            10          1         8              11              18
## 2            10          7         8              17              18
## 3            10          7        10              17              20
## 4            10          1         8              11              18
## 5            10          7        10              17              20
## 6            10          1         8              11              18
##   severity_index3 severity_index4 severity_index5
## 1              19        6.333333       22.333333
## 2              25        8.333333        2.333333
## 3              27        9.000000        3.000000
## 4              19        6.333333       22.333333
## 5              27        9.000000        3.000000
## 6              19        6.333333       22.333333

Extraordinary incidents

# Extraordinary incidents: select the rows whose records_group is
# 'Extraordinary' (rows where the comparison is NA are skipped by which()),
# then print per-column summaries of that selection.
df_ext <- df[which(df$records_group == "Extraordinary"), , drop = FALSE]
summary(df_ext)
##    Date.Made.Public
##  2014-08-05:1      
##  2016-12-14:1      
##  2017-03-08:1      
##  2006-01-01:0      
##  2006-01-12:0      
##  2006-01-16:0      
##  (Other)   :0      
##                                                                   Company 
##  River City Media                                                     :1  
##  Russian hacking discovered by Hold Security                          :1  
##  Yahoo                                                                :1  
##            Spring Independent School District             (Spring, TX):0  
##    Harvard University                                                 :0  
##    Penn State University                                              :0  
##  (Other)                                                              :0  
##         State   Type.of.breach Type.of.organization Total.Records      
##  California:1   CARD:0         BSF:0                Min.   :1.000e+09  
##  Oregon    :1   DISC:1         BSO:3                1st Qu.:1.185e+09  
##  Wisconsin :1   HACK:2         BSR:0                Median :1.370e+09  
##  Alabama   :0   INSD:0         EDU:0                Mean   :1.790e+09  
##  Alaska    :0   PHYS:0         GOV:0                3rd Qu.:2.185e+09  
##  Arizona   :0   PORT:0         MED:0                Max.   :3.000e+09  
##  (Other)   :0   STAT:0         NGO:0                                   
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Description.of.incident 
##  "A gang of Russian hackers has amassed over 1 billion username and \npassword combinations and more than 500 million email addresses, a \nsecurity firm reported late Tuesday, calling it the largest-ever haul of\n stolen Internet credentials.The massive trove — stolen from \nhundreds of thousands of websites — was discovered by the Milwaukee firm\n Hold Security, according to a post on its website".According to reports by Hold Security,  it took over seven months to identify the gang, "whom the firm dubbed CyberVor, or \ncyber-thief in Russian".\n \nIt appears that no payment card information or Social Security numbers were threatened.PRC will provide updates as the story unfolds.  *note: state location provided is that of Hold Security LLC.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          :1  
##  "One of the world's allegedly most prolific spamming operations inadvertently left backup databases accessible online, exposing upwards of 1.37 billion records and a raft of internal company information.Chris Vickery, a security researcher who works for the anti-virus company MacKeeper, discovered the databases, which belong to a US-based email and SMS marketing company called River City Media. In some cases, the records include the names, IP addresses, zip codes and physical addresses associated with the email addresses.The cause of the data exposure appears to be an oversight. The company used the rsync protocol to backup its MySQL databases. But those backup servers were not password-protected, Vickery says in an email to Information Security Media Group.The leak could be one of the largest of all time, but it's likely the databases contain duplicates. The databases, which were exposed for at least three months, have since been taken offline. It's unclear if other fraudsters or hackers may have already stumbled upon it. Some of records were updated as recently as January."If the databases were to be released in the wild, the damage would be astounding," Vickery says. "Abusive ex-boyfriends and stalkers everywhere would have a fresh new source of information on victims. You wouldn't feel the damage all at once, but society would indeed suffer over time."Based on preliminarily checks, at least some of the exposed data is legitimate, Vickery writes in a blog post."Investigating names from the list, through social media and work websites, usually shows that the additional details in the entry are most likely accurate," Vickery writes."More Information: http://www.databreachtoday.com/backup-error-exposes-137-billion-record-s...                                                                                                                                                                                                                                                 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          :1  
##  "Yahoo Inc (YHOO.O) warned on Wednesday that it had uncovered yet another massive cyber attack, saying data from more than 1 billion user accounts was compromised in August 2013, making it the largest breach in history.The number of affected accounts was double the number implicated in 2014 breach that the internet company disclosed in September and blamed on hackers working on behalf of a government.Yahoo required all of its customers to reset their passwords - a stronger measure than it took after the previous breach was discovered, when it only recommended a password reset. Yahoo also said Wednesday that it believes hackers responsible for the previous breach had also accessed the company’s proprietary code to learn how to forge "cookies" that would allow hackers to access an account without a password."Yahoo badly screwed up," said Bruce Schneier, a cryptologist and one of the world's most respected security experts. "They weren't taking security seriously and that's now very clear. 
I would have trouble trusting Yahoo going forward."Yahoo was tentative in its description of new problems, saying the incident was "likely" distinct from the one it reported in September and that stolen information "may have included" names, e-mail addresses, telephone numbers, dates of birth, hashed passwords and, in some cases, encrypted or unencrypted security questions and answers."More information: http://www.reuters.com/article/us-yahoo-cyber-idUSKBN1432WZYahoo statement: https//yahoo.com/security-updateUPDATE (2/15/2017):"Yahoo's newly issued warning to users about malicious hacks is related to a third data breach that the company disclosed in December 2016.A warning sent to some Yahoo users Wednesday read: "Based on the ongoing investigation, we believe a forged cookie may have been used in 2015 or 2016 to access your account."This breach was previously revealed in a December 2016 statement from Yahoo that also provided information on a separate hack that occurred in August 2013 involving more than 1 billion accounts. In addition, some of the 2015 and 2016 incidents have been tied to a "state-sponsored actor" that was involved in a different 2014 breach that affected up to 500 million accounts."Forged cookies" are digital keys that allow access to information without re-entering passwords. The leaked data included email addresses, birth dates and answers to security questions. Yahoo declined to say how many people were affected."More information: http://www.cnbc.com/2017/02/15/yahoo-sends-new-warning-to-customers-abou...UPDATE (3/15/2017): The U.S. Justice Department today unsealed indictments against four men accused of hacking into half-billion Yahoo email accounts.  
Two of the men named in the indictments worked for a unit of the Russian Federal Security Services (FSB) that serves as the FBI's point of contact in Moscow on cybercrime cases."More Information: http://krebsonsecurity.com/UPDATE (9/7/2017): Link to Yahoo judgement: https://www.documentcloud.org/documents/3986196-Yahoo-judgement-on-data-...UPDATE (10/3/2017): "Yahoo has tripled down on what was already the largest data breach in history, saying it affected all 3 billion accounts on its service, not the 1 billion it revealed late last year.The company announced Tuesday that it's providing notice to additional user accounts affected by the August 2013 data theft."More Information: http://hosted.ap.org/dynamic/stories/U/US_YAHOO_DATA_BREACH?SITE=AP&SECT...                                                                                                                                                                                                       :1  
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          :0  
##   \t \t \t \t \t"NEW YORK -- Millions of records from a commercial corporate database have been leaked. \t \t \t \t \tThe database, about 52 gigabytes in size, contains just under 33.7 million unique email addresses and other contact information from employees of thousands of companies, representing a large portion of the US corporate population. \t \t \t \t \tDun & Bradstreet, a business services giant, confirmed that it owns the database, which it acquired as part  \tof a 2015 deal to buy NetProspex for $125 million.The purchased database contains dozens of fields, some including personal information such as names, job titles and functions, work email addresses, and phone numbers. \t \t \t \t \tOther information includes more generic corporate and publicly sourced data, such as believed office location, the number of employees in the business unit, and other descriptions of the kind of industry the company falls into, such as advertising, legal,  media and broadcasting, and telecoms. \t \t \t \t \tThis entire database is used for marketers who want to directly target their own email campaigns and through other communications methods for current and prospective customers. \t \t \t \t \tThe data can be bought either in bulk, or by type of record by companies, but it's not known exactly how much the going rate is for a full data set of this size. We understand from  \ta 2015 brochure that the cost of accessing a half-million records can cost some firms up to $200,000. \t \t \t \t \tTroy Hunt, who runs breach notification site  \tHave I Been Pwned, obtained the database and analyzed the records. \t \t \t \t \tIn  \ta blog post Tuesday, Hunt said the breakdown was entirely US-focused, with California as the most represented demographic with over four million records, then New York with 2.7 million records and Texas with 2.6 million records. \t \t \t \t \tHunt's analysis of the records showed that the leading organization by records is the Dept. 
of Defense, with 101,013 employee records, followed closely by the US Postal Service with 88,153 employee records. \t \t \t \t \tThe US Army, Air Force, and Dept. of Veterans Affairs are all listed with a combined 76,379 records. \t \t \t \t \tAT&T, Boeing, Dell, FedEx, IBM, and Xerox were among the most named companies in the database, with tens of thousands of employee records each. \t \t \t \t \t"Whilst you could piece together parts of the data from information already in the public domain, having it aggregated and so easily searchable in this fashion is enormously valuable," said Hunt in an email on Tuesday. "It also serves as a reminder that we've lost control of our privacy; the vast majority of people in the data set would have no idea their information is being sold in this fashion and they certainly don't have any control over it." \t \t \t \t \tHunt ran the exposed database through  \tHave I Been Pwned's database of breached records, which showed 14 percent of email addresses already existed in his database. \t \t \t \t \tThe data is now searchable in  \tHave I Been Pwned. \t \t \t \t \tBut it's not known exactly how the data was exposed, or who is to blame for the leak.  \t \t \t \t \tA spokesperson for Dun & Bradshaw would not talk on the record beyond an emailed statement, sent prior to publication. \t \t \t \t \t"We've carefully evaluated the information that was shared with us and it is of a type and in a format that we deliver to customers every day. Based on our analysis, it was not accessed or exposed through a Dun & Bradstreet system," the statement read."More Information: http://www.zdnet.com/article/millions-of-records-leaked-from-huge-corpor...:0  
##                                                                                                  As\n reported by Health and Human Services unauthorized access/disclosure. No specific information as\n to what information was  \ncompromised  as provided by health and human services. More Information: https://ocrportal.hhs.gov/ocr/breach/breach_report.jsf;jsessionid=9BF4AF...                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          :0  
##  (Other)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          :0  
##                    Information.Source
##  Media                      :3       
##                             :0       
##  California Attorney General:0       
##  Databreaches.net           :0       
##  Dataloss DB                :0       
##  Government Agency          :0       
##  (Other)                    :0       
##                                                                                                                            Source.URL
##                                                                                                                                 :3   
##  http://6abc.com/student-loan-data-breach-affects-16500-borrowers/3402556/                                                      :0   
##  http://abc30.com/fresno-state-data-breach-exposes-personal-information-of-15000-people/3182146/                                :0   
##  http://abc7.com/technology/30k-ucla-students-warned-about-potential-security-breach/2279390/                                   :0   
##  http://agportal-s3bucket.s3.amazonaws.com/uploadedfiles/Another/Supporting_Law_Enforcement/MultnomahAthleticClub.2018-01-10.pdf:0   
##  http://enewspaper.latimes.com/infinity/article_popover_share.aspx?guid=0511a587-c9aa-4ea2-a331-64f54856baeb                    :0   
##  (Other)                                                                                                                        :0   
##  Year.of.Breach    Latitude       Longitude       breach_type_gA
##  Min.   :2014   Min.   :37.37   Min.   :-122.68   Hacker:2      
##  1st Qu.:2015   1st Qu.:39.63   1st Qu.:-122.36   Others:1      
##  Median :2016   Median :41.90   Median :-122.04                 
##  Mean   :2016   Mean   :41.60   Mean   :-110.89                 
##  3rd Qu.:2016   3rd Qu.:43.71   3rd Qu.:-104.99                 
##  Max.   :2017   Max.   :45.52   Max.   : -87.95                 
##                                                                 
##   breach_type_gB breach_type_gC breach_type_gD      region_A      region_B
##  Insider :1      Offline:0      Local  :0      Midwest  :1   Midwest  :1  
##  Mixed   :0      Online :2      Network:3      Non_US   :0   Non_US   :0  
##  Outsider:2      Others :1      Others :0      Northeast:0   Northeast:0  
##                                                South    :0   Southeast:0  
##                                                West     :2   Southwest:0  
##                                                              West     :2  
##                                                                           
##    age_of_Law           org_group       records_group records_index   
##  12     :1    F_regulated    :0   Extraordinary:3     Min.   :0.5587  
##  13     :1    non_F_regulated:3   Others       :0     1st Qu.:0.6620  
##  16     :1                                            Median :0.7654  
##  0      :0                                            Mean   :1.0000  
##  10     :0                                            3rd Qu.:1.2207  
##  11     :0                                            Max.   :1.6760  
##  (Other):0                                                            
##  records_scale   type_scale     org_scale severity_index1 severity_index2
##  Min.   :10    Min.   : 1.0   Min.   :6   Min.   :11.0    Min.   :16     
##  1st Qu.:10    1st Qu.: 5.5   1st Qu.:6   1st Qu.:15.5    1st Qu.:16     
##  Median :10    Median :10.0   Median :6   Median :20.0    Median :16     
##  Mean   :10    Mean   : 7.0   Mean   :6   Mean   :17.0    Mean   :16     
##  3rd Qu.:10    3rd Qu.:10.0   3rd Qu.:6   3rd Qu.:20.0    3rd Qu.:16     
##  Max.   :10    Max.   :10.0   Max.   :6   Max.   :20.0    Max.   :16     
##                                                                          
##  severity_index3 severity_index4 severity_index5 
##  Min.   :17.0    Min.   :5.667   Min.   : 5.333  
##  1st Qu.:21.5    1st Qu.:7.167   1st Qu.: 5.333  
##  Median :26.0    Median :8.667   Median : 5.333  
##  Mean   :23.0    Mean   :7.667   Mean   :10.333  
##  3rd Qu.:26.0    3rd Qu.:8.667   3rd Qu.:12.833  
##  Max.   :26.0    Max.   :8.667   Max.   :20.333  
## 

Checking information of dataset

# Subset of data breaches excluding the extraordinary incidents.
# "Extraordinary" is a level of records_group; region_A only holds geographic
# regions, so comparing region_A against "Extraordinary" never removes the
# extraordinary incidents. Filter on records_group instead.
df <- subset(df, records_group != "Extraordinary")
head(df)
##   Date.Made.Public                        Company     State Type.of.breach
## 1       2009-10-21  Bullitt County Public Schools  Kentucky           DISC
## 2       2009-10-21 Roane State Community College  Tennessee           PORT
## 3       2009-10-15                 Halifax Health   Florida           PORT
## 4       2009-10-04      Suffolk Community College  New York           DISC
## 5       2009-09-28               Penrose Hospital  Colorado           PHYS
## 6       2009-09-23    Eastern Kentucky University  Kentucky           DISC
##   Type.of.organization Total.Records
## 1                  EDU           676
## 2                  EDU         14783
## 3                  MED         33000
## 4                  EDU           300
## 5                  MED           175
## 6                  EDU          5045
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Description.of.incident
## 1                                                                                                                                                                                                                                                                                               A Bullitt County Public Schools \n            employee accidentally sent an e-mail message to about 1,800 school \n            district workers that included the names and Social Security numbers \n            of 676 district employees. The employees were identified as not having \n            completed the district's 2010 open-enrollment process for insurance, \n            and the e-mail was intended as a reminder to complete the process.
## 2 Roane State Community College \n            has announced that the names and Social Security numbers of 9,747 \n            current or former students were on a data storage device stolen from \n            an employee's vehicle, along with 1,194 current/former employees' \n            information. The Social Security numbers alone, with no names, were \n            also stolen for 5,036 additional current or former students. The data \n            was on a 4GB USB drive used for work-related purposes. An employee \n            took it home to do work after hours, and left it in the car. The employee \n            forgot to lock the car doors. The USB drive was stolen along with \n            a personal hand-held device.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   A laptop computer from a Halifax \n            Health employee's vehicle in Orange County was stolen -- which might \n            have contained password protected patient information.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                               Suffolk Community College has \n            agreed to pay a company for the next year to monitor the credit of \n            300 students whose last names and Social Security numbers were mistakenly \n            listed in an attachment to an e-mail sent to those students last month.
## 5                                                                                                                                                                                                                                                                                                                     Officials at Penrose Hospital              believe someone has stolen the personal information of 175 patients.              The missing information consists of names, addresses, phone numbers,              Social Security numbers and the reason for the patients' visits. The              information was stored on a computer print-out and kept in a binder              stored in a cabinet. The print out has gone missing.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         The names and Social Security \n            numbers of about 5,000 Eastern Kentucky University faculty, staff \n            and student workers were posted inadvertently on the Internet last \n            September, where they have been displayed for a year.
##   Information.Source Source.URL Year.of.Breach Latitude  Longitude
## 1        Dataloss DB                      2009 37.98840  -85.71579
## 2        Dataloss DB                      2009 35.93396  -84.55244
## 3        Dataloss DB                      2009 29.21082  -81.02283
## 4        Dataloss DB                      2009 40.86649  -73.03566
## 5        Dataloss DB                      2009 38.83388 -104.82136
## 6        Dataloss DB                      2009 37.74786  -84.29465
##   breach_type_gA breach_type_gB breach_type_gC breach_type_gD  region_A
## 1         Others        Insider         Others        Network     South
## 2         Others          Mixed        Offline          Local     South
## 3         Others          Mixed        Offline          Local     South
## 4         Others        Insider         Others        Network Northeast
## 5         Others          Mixed        Offline          Local      West
## 6         Others        Insider         Others        Network     South
##    region_B age_of_Law   org_group records_group records_index
## 1 Southeast          0 F_regulated        Others  0.0008129202
## 2 Southeast         14 F_regulated        Others  0.0177772189
## 3 Southeast         14 F_regulated        Others  0.0396839765
## 4 Northeast         14 F_regulated        Others  0.0003607634
## 5      West         13 F_regulated        Others  0.0002104453
## 6 Southeast          0 F_regulated        Others  0.0060668382
##   records_scale type_scale org_scale severity_index1 severity_index2
## 1            10          1         8              11              18
## 2            10          7         8              17              18
## 3            10          7        10              17              20
## 4            10          1         8              11              18
## 5            10          7        10              17              20
## 6            10          1         8              11              18
##   severity_index3 severity_index4 severity_index5
## 1              19        6.333333       22.333333
## 2              25        8.333333        2.333333
## 3              27        9.000000        3.000000
## 4              19        6.333333       22.333333
## 5              27        9.000000        3.000000
## 6              19        6.333333       22.333333
# Inspect the dimensions (rows, columns), column names and structure of the
# dataset.
c(nrow(df), ncol(df))
## [1] 5355   30
# List the column names of the data frame.
colnames(df)
##  [1] "Date.Made.Public"        "Company"                
##  [3] "State"                   "Type.of.breach"         
##  [5] "Type.of.organization"    "Total.Records"          
##  [7] "Description.of.incident" "Information.Source"     
##  [9] "Source.URL"              "Year.of.Breach"         
## [11] "Latitude"                "Longitude"              
## [13] "breach_type_gA"          "breach_type_gB"         
## [15] "breach_type_gC"          "breach_type_gD"         
## [17] "region_A"                "region_B"               
## [19] "age_of_Law"              "org_group"              
## [21] "records_group"           "records_index"          
## [23] "records_scale"           "type_scale"             
## [25] "org_scale"               "severity_index1"        
## [27] "severity_index2"         "severity_index3"        
## [29] "severity_index4"         "severity_index5"
# Compactly display the type and first values of every column.
utils::str(df)
## 'data.frame':    5355 obs. of  30 variables:
##  $ Date.Made.Public       : Factor w/ 2548 levels "2006-01-01","2006-01-12",..: 692 692 690 686 684 682 681 681 680 678 ...
##  $ Company                : Factor w/ 4732 levels "          Spring Independent School District             (Spring, TX)",..: 602 3348 1663 3780 3080 1267 453 3407 3366 4246 ...
##  $ State                  : Factor w/ 61 levels "Alabama","Alaska",..: 23 53 13 39 9 23 54 8 61 13 ...
##  $ Type.of.breach         : Factor w/ 7 levels "CARD","DISC",..: 2 6 6 2 5 2 6 5 2 2 ...
##  $ Type.of.organization   : Factor w/ 7 levels "BSF","BSO","BSR",..: 4 4 6 4 6 4 1 6 1 4 ...
##  $ Total.Records          : num  676 14783 33000 300 175 ...
##  $ Description.of.incident: Factor w/ 4371 levels ""," \t \t \t \t \t\"NEW YORK -- Millions of records from a commercial corporate database have been leaked. \t \t \"| __truncated__,..: 331 3353 918 3515 2871 3929 2639 4157 520 2453 ...
##  $ Information.Source     : Factor w/ 16 levels "","California Attorney General",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Source.URL             : Factor w/ 95 levels "","http://6abc.com/student-loan-data-breach-affects-16500-borrowers/3402556/",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Year.of.Breach         : int  2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
##  $ Latitude               : num  38 35.9 29.2 40.9 38.8 ...
##  $ Longitude              : num  -85.7 -84.6 -81 -73 -104.8 ...
##  $ breach_type_gA         : Factor w/ 2 levels "Hacker","Others": 2 2 2 2 2 2 2 2 2 2 ...
##  $ breach_type_gB         : Factor w/ 3 levels "Insider","Mixed",..: 1 2 2 1 2 1 2 2 1 1 ...
##  $ breach_type_gC         : Factor w/ 3 levels "Offline","Online",..: 3 1 1 3 1 3 1 1 3 3 ...
##  $ breach_type_gD         : Factor w/ 3 levels "Local","Network",..: 2 1 1 2 1 2 1 1 2 2 ...
##  $ region_A               : Factor w/ 5 levels "Midwest","Non_US",..: 4 4 4 3 5 4 4 5 5 4 ...
##  $ region_B               : Factor w/ 6 levels "Midwest","Non_US",..: 4 4 4 3 6 4 5 6 6 4 ...
##  $ age_of_Law             : Factor w/ 18 levels "0","10","11",..: 1 6 6 6 5 1 2 7 4 6 ...
##  $ org_group              : Factor w/ 2 levels "F_regulated",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ records_group          : Factor w/ 2 levels "Extraordinary",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ records_index          : num  0.000813 0.017777 0.039684 0.000361 0.00021 ...
##  $ records_scale          : num  10 10 10 10 10 ...
##  $ type_scale             : int  1 7 7 1 7 1 7 7 1 1 ...
##  $ org_scale              : int  8 8 10 8 10 8 9 10 9 8 ...
##  $ severity_index1        : num  11 17 17 11 17 ...
##  $ severity_index2        : num  18 18 20 18 20 ...
##  $ severity_index3        : num  19 25 27 19 27 ...
##  $ severity_index4        : num  6.33 8.33 9 6.33 9 ...
##  $ severity_index5        : num  22.33 2.33 3 22.33 3 ...

The dataset has 5355 rows with 30 columns. The data type of age_of_Law is Factor, so it needs to be converted to numeric.

# Convert age_of_Law from factor to numeric. The values are taken from the
# level labels (via as.character) rather than from the internal level codes.
df[["age_of_Law"]] <- as.numeric(as.character(df[["age_of_Law"]]))
class(df[["age_of_Law"]])
## [1] "numeric"
# description of the dataset
summary(df)
##    Date.Made.Public                       Company            State     
##  2016-02-26:  18    University of Florida     :  10   California: 657  
##  2013-11-08:  15    Experian                  :   7   New York  : 407  
##  2011-12-09:  14    Henry Ford Health System  :   7   Texas     : 398  
##  2012-08-03:  12    Mount Sinai Medical Center:   7   Florida   : 320  
##  2013-08-28:  10    Private Medical Practice  :   7   Maryland  : 272  
##  2012-07-17:   9    Walgreen Co.              :   7   Illinois  : 226  
##  (Other)   :5277    (Other)                   :5310   (Other)   :3075  
##  Type.of.breach Type.of.organization Total.Records      
##  CARD:  32      BSF: 321             Min.   :1.000e+00  
##  DISC:1268      BSO: 382             1st Qu.:6.520e+02  
##  HACK:1344      BSR: 250             Median :2.006e+03  
##  INSD: 357      EDU: 565             Mean   :1.811e+06  
##  PHYS:1362      GOV: 465             3rd Qu.:1.018e+04  
##  PORT: 819      MED:3310             Max.   :3.000e+09  
##  STAT: 173      NGO:  62                                
##                                                                             Description.of.incident
##  Location of breached information: Network Server\nBusiness associate present: No\n       : 147    
##  Location of breached information: Email\nBusiness associate present: No\n                : 107    
##  Location of breached information: Paper/Films\nBusiness associate present: No\n          :  90    
##  \\N\nLocation of breached information: Laptop\nBusiness associate present: No\n          :  80    
##  \\N\nLocation of breached information: Paper/Films\nBusiness associate present: No\n     :  67    
##  \\N\nLocation of breached information: Desktop Computer\nBusiness associate present: No\n:  48    
##  (Other)                                                                                  :4816    
##                                   Information.Source
##  US Department of Health and Human Services:1946    
##  Dataloss DB                               :1213    
##  Media                                     : 564    
##  Databreaches.net                          : 445    
##  PHIPrivacy.net                            : 370    
##  Government Agency                         : 238    
##  (Other)                                   : 579    
##                                                                                                Source.URL  
##                                                                                                     :3125  
##  https://ocrportal.hhs.gov/ocr/breach/breach_report.jsf                                             :1947  
##  http://www.marylandattorneygeneral.gov/Pages/IdentityTheft/breachnotices.aspx                      : 157  
##  http://www.marylandattorneygeneral.gov/Pages/IdentityTheft/breachnotices.aspx?subfolder=2015       :  36  
##  http://www.healthcareitnews.com/news/hackers-breach-new-yorks-largest-provider-phishing-attacks    :   2  
##  http://www.healthcareitnews.com/news/surgery-center-says-34000-patient-records-potentially-breached:   2  
##  (Other)                                                                                            :  86  
##  Year.of.Breach    Latitude        Longitude       breach_type_gA
##  Min.   :2006   Min.   :-34.60   Min.   :-158.06   Hacker:1344   
##  1st Qu.:2010   1st Qu.: 36.78   1st Qu.: -95.71   Others:4011   
##  Median :2012   Median : 40.71   Median : -81.03                 
##  Mean   :2012   Mean   : 38.66   Mean   : -86.96                 
##  3rd Qu.:2015   3rd Qu.: 40.76   3rd Qu.: -73.98                 
##  Max.   :2017   Max.   : 64.84   Max.   :   0.00                 
##                                                                  
##   breach_type_gB breach_type_gC breach_type_gD      region_A   
##  Insider :1625   Offline:2181   Local  :2354   Midwest  :1086  
##  Mixed   :2354   Online :1376   Network:2644   Non_US   :   0  
##  Outsider:1376   Others :1798   Others : 357   Northeast:1030  
##                                                South    :1956  
##                                                West     :1283  
##                                                                
##                                                                
##       region_B      age_of_Law              org_group   
##  Midwest  :1086   Min.   : 0.00   F_regulated    :4196  
##  Non_US   :   0   1st Qu.:12.00   non_F_regulated:1159  
##  Northeast:1315   Median :13.00                         
##  Southeast:1226   Mean   :12.55                         
##  Southwest: 579   3rd Qu.:14.00                         
##  West     :1149   Max.   :16.00                         
##                                                         
##        records_group  records_index      records_scale      type_scale   
##  Extraordinary:   3   Min.   :  0.0000   Min.   : 7.311   Min.   : 1.00  
##  Others       :5352   1st Qu.:  0.0008   1st Qu.:10.000   1st Qu.: 7.00  
##                       Median :  0.0024   Median :10.000   Median : 7.00  
##                       Mean   :  0.9722   Mean   : 9.972   Mean   : 6.35  
##                       3rd Qu.:  0.0122   3rd Qu.:10.000   3rd Qu.:10.00  
##                       Max.   :601.2724   Max.   :10.000   Max.   :10.00  
##                                                                          
##    org_scale      severity_index1  severity_index2 severity_index3
##  Min.   : 1.000   Min.   : 8.311   Min.   :10.31   Min.   :12.00  
##  1st Qu.: 8.000   1st Qu.:17.000   1st Qu.:18.00   1st Qu.:21.00  
##  Median :10.000   Median :17.000   Median :20.00   Median :27.00  
##  Mean   : 8.231   Mean   :16.323   Mean   :18.20   Mean   :24.55  
##  3rd Qu.:10.000   3rd Qu.:19.999   3rd Qu.:20.00   3rd Qu.:27.00  
##  Max.   :10.000   Max.   :20.000   Max.   :20.00   Max.   :30.00  
##                                                                   
##  severity_index4  severity_index5 
##  Min.   : 4.000   Min.   : 0.000  
##  1st Qu.: 7.000   1st Qu.: 2.883  
##  Median : 9.000   Median : 3.000  
##  Mean   : 8.185   Mean   : 9.816  
##  3rd Qu.: 9.000   3rd Qu.:21.000  
##  Max.   :10.000   Max.   :27.000  
## 

Univariate Exploration

type of breach

# Share of incidents by breach type, as a percentage of all incidents
ggplot(data = df, mapping = aes(x = Type.of.breach)) +
  geom_bar(mapping = aes(y = ..count.. / sum(..count..))) +
  ylab('Percentage') +
  scale_y_continuous(labels = scales::percent)

# Chi-square goodness-of-fit test on the breach type counts
chisq.test(x = table(df$Type.of.breach))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$Type.of.breach)
## X-squared = 2616.7, df = 6, p-value < 2.2e-16

‘PHYS’, ‘DISC’ and ‘HACK’ (3 of the 7 breach types) account for about 74% of the breaches.

breach type group B

# Proportion of incidents in each group-B breach category; group = 1 makes
# ..prop.. be computed over all bars together rather than within each bar
ggplot(data = df, mapping = aes(x = breach_type_gB)) +
  geom_bar(mapping = aes(y = ..prop.., group = 1), stat = "count")

# Chi-square goodness-of-fit test on the group-B category counts
chisq.test(x = table(df$breach_type_gB))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$breach_type_gB)
## X-squared = 289.44, df = 2, p-value < 2.2e-16

breach type group C

# Proportion of incidents in each group-C breach category; group = 1 makes
# ..prop.. be computed over all bars together rather than within each bar
ggplot(data = df, mapping = aes(x = breach_type_gC)) +
  geom_bar(mapping = aes(y = ..prop.., group = 1), stat = "count")

# Chi-square goodness-of-fit test on the group-C category counts
chisq.test(x = table(df$breach_type_gC))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$breach_type_gC)
## X-squared = 181.66, df = 2, p-value < 2.2e-16

breach type group D

# Proportion of incidents in each group-D breach category; group = 1 makes
# ..prop.. be computed over all bars together rather than within each bar
ggplot(data = df, mapping = aes(x = breach_type_gD)) +
  geom_bar(mapping = aes(y = ..prop.., group = 1), stat = "count")

# Chi-square goodness-of-fit test on the group-D category counts
chisq.test(x = table(df$breach_type_gD))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$breach_type_gD)
## X-squared = 1737.2, df = 2, p-value < 2.2e-16

Type.of.organization

# Share of incidents by organization type, as a percentage of all incidents
ggplot(data = df, mapping = aes(x = Type.of.organization)) +
  geom_bar(mapping = aes(y = ..count.. / sum(..count..))) +
  ylab('Percentage') +
  scale_y_continuous(labels = scales::percent)

# Chi-square goodness-of-fit test on the organization type counts
chisq.test(x = table(df$Type.of.organization))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$Type.of.organization)
## X-squared = 10079, df = 6, p-value < 2.2e-16

More than 60% of the breaches are from medical organizations.

organization group

# Proportion of incidents in each organization group; group = 1 makes
# ..prop.. be computed over all bars together rather than within each bar
ggplot(data = df, mapping = aes(x = org_group)) +
  geom_bar(mapping = aes(y = ..prop.., group = 1), stat = "count")

# Chi-square goodness-of-fit test on the organization group counts
chisq.test(x = table(df$org_group))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$org_group)
## X-squared = 1722.4, df = 1, p-value < 2.2e-16

total records

# Total Records
summary(df$Total.Records)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.000e+00 6.520e+02 2.006e+03 1.811e+06 1.018e+04 3.000e+09
# Histogram of Total.Records on the raw scale
p1 <- ggplot(df, aes(x = Total.Records)) +
  geom_histogram() +
  xlab('US Total Records') +
  ylab('Count')
# Same histogram with the x axis on a log10 scale
p2 <- ggplot(df, aes(x = Total.Records)) +
  geom_histogram() +
  xlab('US Total Records (log10)') +
  ylab('Count') +
  scale_x_log10()
# p3 <- qplot(x = Total.Records, data = subset(df, State == 'California'), xlab = 'CA Total Records(log10)', ylab = 'Count') +
# scale_x_log10()
# Stack the two histograms vertically in a single column
grid.arrange(p1, p2, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of total records is heavily right-skewed. After a log10 transformation, it looks approximately normal, centered around 1,000 records.

incidents

# Incidents per year as frequency polygons: all rows in blue, California
# rows only in red
ggplot(mapping = aes(x = Year.of.Breach)) +
  geom_freqpoly(data = df, stat = 'count', colour = 'blue') +
  geom_freqpoly(
    data = subset(df, State == 'California'),
    stat = 'count',
    colour = 'red'
  ) +
  scale_x_continuous(breaks = 2006:2017)

# Chi-square goodness-of-fit test on the yearly incident counts
chisq.test(x = table(df$Year.of.Breach))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$Year.of.Breach)
## X-squared = 495.62, df = 11, p-value < 2.2e-16
# density plot of incidents by year
names(df)
##  [1] "Date.Made.Public"        "Company"                
##  [3] "State"                   "Type.of.breach"         
##  [5] "Type.of.organization"    "Total.Records"          
##  [7] "Description.of.incident" "Information.Source"     
##  [9] "Source.URL"              "Year.of.Breach"         
## [11] "Latitude"                "Longitude"              
## [13] "breach_type_gA"          "breach_type_gB"         
## [15] "breach_type_gC"          "breach_type_gD"         
## [17] "region_A"                "region_B"               
## [19] "age_of_Law"              "org_group"              
## [21] "records_group"           "records_index"          
## [23] "records_scale"           "type_scale"             
## [25] "org_scale"               "severity_index1"        
## [27] "severity_index2"         "severity_index3"        
## [29] "severity_index4"         "severity_index5"
# Density of incidents over the years: all rows in blue, California rows
# only in red
ggplot(mapping = aes(x = Year.of.Breach)) +
  geom_density(data = df, colour = 'blue') +
  geom_density(data = subset(df, State == 'California'), colour = 'red') +
  scale_x_continuous(breaks = 2006:2017)

region B

# Share of incidents by region (region_B), as a percentage of all incidents
ggplot(data = df, mapping = aes(x = region_B)) +
  geom_bar(mapping = aes(y = ..count.. / sum(..count..))) +
  ylab('Percentage') +
  scale_y_continuous(labels = scales::percent)

# Chi-square goodness-of-fit test on the region counts
chisq.test(x = table(df$region_B))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$region_B)
## X-squared = 1442.9, df = 5, p-value < 2.2e-16

Southwest, which includes ‘Texas’, ‘Oklahoma’, ‘New Mexico’ and ‘Arizona’, has relatively fewer data breach events than the other regions.

age of law

# Histogram of age_of_Law (default binning)
ggplot(df, aes(x = age_of_Law)) +
  geom_histogram() +
  xlab('Age of Law') +
  ylab('Count')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

severity index 1

# Distribution of severity_index1 (histogram with the default binning)
ggplot(data = df, mapping = aes(x = severity_index1)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of severity index 1 is slightly left-skewed, with a peak at 17.

severity index 2

# Distribution of severity_index2 (histogram with the default binning)
ggplot(data = df, mapping = aes(x = severity_index2)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of severity index 2 is heavily left-skewed, with a peak at 20.

severity index 3

# Distribution of severity_index3 (histogram with the default binning)
ggplot(data = df, mapping = aes(x = severity_index3)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

severity index 4

# Distribution of severity_index4 (histogram with the default binning)
ggplot(data = df, mapping = aes(x = severity_index4)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

severity index 5

# Distribution of severity_index5 (histogram with the default binning)
ggplot(data = df, mapping = aes(x = severity_index5)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bivariate exploration & multivariate exploration

type over time

# Yearly number of incidents, one line per breach type
ggplot(
  data = dplyr::count(df, Year.of.Breach, Type.of.breach),
  mapping = aes(
    x = Year.of.Breach, y = n,
    group = Type.of.breach, colour = Type.of.breach
  )
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Count') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

All types show some fluctuations. Breaches of type ‘HACK’ and ‘DISC’ are increasing over the years and account for more incidents, while the other types are decreasing and account for fewer incidents. Organizations that are more vulnerable to ‘HACK’ and ‘DISC’ need to pay more attention. In the following part, we will explore which organizations are more vulnerable to ‘HACK’ and ‘DISC’.

type group B over time

# Yearly number of incidents, one line per group-B breach category
ggplot(
  data = dplyr::count(df, Year.of.Breach, breach_type_gB),
  mapping = aes(
    x = Year.of.Breach, y = n,
    group = breach_type_gB, colour = breach_type_gB
  )
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Count') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

Breaches caused by outsiders have been increasing during these years, while breaches caused by insiders have not been increasing too much since year 2013.

difference between insider and outsider over time

# Yearly counts of Insider and Outsider breaches, reshaped to one row per
# year with the columns n.Insider and n.Outsider
io_df <- df %>%
  subset(breach_type_gB %in% c('Insider', 'Outsider')) %>%
  count(Year.of.Breach, breach_type_gB) %>%
  data.frame() %>%
  reshape(idvar = "Year.of.Breach", timevar = 'breach_type_gB', direction = "wide")

# Positive values mean more Outsider than Insider breaches in that year
io_df$diff <- io_df$n.Outsider - io_df$n.Insider
# The y axis shows a difference of counts, so it is labelled as such rather
# than as a plain 'Count'
ggplot(data = io_df, aes(x = Year.of.Breach, y = diff)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, 2)) +
  xlab('Year of breach') +
  ylab('Outsider minus Insider count')

type group C over time

# Yearly number of incidents, one line per group-C breach category
ggplot(
  data = dplyr::count(df, Year.of.Breach, breach_type_gC),
  mapping = aes(
    x = Year.of.Breach, y = n,
    group = breach_type_gC, colour = breach_type_gC
  )
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Count') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

Offline breaches have been decreasing dramatically since 2010, while online breaches have been increasing.

type group D over time

# Yearly number of incidents, one line per group-D breach category
ggplot(
  data = dplyr::count(df, Year.of.Breach, breach_type_gD),
  mapping = aes(
    x = Year.of.Breach, y = n,
    group = breach_type_gD, colour = breach_type_gD
  )
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Count') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

Breaches from network have been increasing while breaches from local have been decreasing.

organizations vulnerable to ‘HACK’ and ‘DISC’.

# Organization types among 'HACK' and 'DISC' incidents combined, as a
# percentage of those incidents
ggplot(
  data = subset(df, Type.of.breach %in% c('HACK', 'DISC')),
  mapping = aes(x = Type.of.organization)
) +
  geom_bar(mapping = aes(y = ..count.. / sum(..count..))) +
  labs(x = 'Type of organization', y = 'Percentage') +
  scale_y_continuous(labels = scales::percent)

# Organization types among 'HACK' incidents only
ggplot(
  data = subset(df, Type.of.breach == 'HACK'),
  mapping = aes(x = Type.of.organization)
) +
  geom_bar(mapping = aes(y = ..count.. / sum(..count..))) +
  labs(x = 'Type of organization', y = 'Percentage') +
  scale_y_continuous(labels = scales::percent)

# Organization types among 'DISC' incidents only
ggplot(
  data = subset(df, Type.of.breach == 'DISC'),
  mapping = aes(x = Type.of.organization)
) +
  geom_bar(mapping = aes(y = ..count.. / sum(..count..))) +
  labs(x = 'Type of organization', y = 'Percentage') +
  scale_y_continuous(labels = scales::percent)

# All incidents by organization type, each bar filled by breach type
ggplot(
  data = df,
  mapping = aes(x = Type.of.organization, fill = Type.of.breach)
) +
  geom_bar(mapping = aes(y = ..count.. / sum(..count..))) +
  labs(x = 'Type of organization', y = 'Percentage') +
  scale_y_continuous(labels = scales::percent)

More than 50% of the incidents for ‘BSO’, ‘BSR’, ‘EDU’ are of type ‘HACK’ and ‘DISC’.

organization over time

# Yearly number of incidents, one line per organization type
ggplot(
  data = dplyr::count(df, Year.of.Breach, Type.of.organization),
  mapping = aes(
    x = Year.of.Breach, y = n,
    group = Type.of.organization, colour = Type.of.organization
  )
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Count') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

Breach in medical is dramatically increasing especially after year 2009. Breach in ‘BSO’ reached its peak in year 2011, it has been decreasing through year 2015 but starts to rebound after that.

So medical and ‘BSO’ need to pay relatively more attention to data breach.

organization group over time

# Yearly number of incidents, one line per organization group
ggplot(
  data = dplyr::count(df, Year.of.Breach, org_group),
  mapping = aes(
    x = Year.of.Breach, y = n,
    group = org_group, colour = org_group
  )
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Count') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

Although there are some fluctuations, the overall trend is that breaches in federally regulated organizations are increasing while breaches in non-federally regulated organizations are decreasing.

organization vs. type vs. incidents

# Heat map of incident shares: summarise() drops the last grouping level, so
# n / sum(n) is each breach type's share within its organization type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(n = n()) %>%
  mutate(incidents_indensity = n / sum(n)) %>%
  ggplot(mapping = aes(x = Type.of.organization, y = Type.of.breach)) +
  scale_fill_continuous(low = "white", high = "black") +
  geom_tile(mapping = aes(fill = incidents_indensity)) +
  geom_text(mapping = aes(label = round(incidents_indensity, digits = 2)))

# Chi-square test of independence between organization type and breach type
chisq.test(x = table(df$Type.of.organization, df$Type.of.breach))
## 
##  Pearson's Chi-squared test
## 
## data:  table(df$Type.of.organization, df$Type.of.breach)
## X-squared = 1194.7, df = 36, p-value < 2.2e-16

‘BSF’ is more prone to ‘HACK’ and ‘PORT’. ‘BSO’ and ‘BSR’ are more prone to ‘HACK’. ‘EDU’ is more prone to ‘DISC’ and ‘HACK’. ‘GOV’ is more prone to ‘PORT’. ‘MED’ is more prone to ‘PHYS’. ‘NGO’ is more prone to ‘PORT’ and ‘HACK’.

organization vs. type vs. records

# Heat map of record shares: summarise() drops the last grouping level, so
# the ratio is each breach type's share of the records within its
# organization type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(total_records = sum(Total.Records)) %>%
  mutate(records_indensity = total_records / sum(total_records)) %>%
  ggplot(mapping = aes(x = Type.of.organization, y = Type.of.breach)) +
  scale_fill_continuous(low = "white", high = "black") +
  geom_tile(mapping = aes(fill = records_indensity)) +
  geom_text(mapping = aes(label = round(records_indensity, digits = 2)))

severity_index1 over time

# Mean severity_index1 per year of breach, drawn as a line. aggregate()
# names the year column Group.1 and the mean column x.
ggplot(
  data = aggregate(
    df$severity_index1,
    by = list(df$Year.of.Breach),
    FUN = mean
  ),
  mapping = aes(x = Group.1, y = x)
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Average severity index 1') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

severity_index2 over time

# Mean severity_index2 per year of breach, drawn as a line. aggregate()
# names the year column Group.1 and the mean column x.
ggplot(
  data = aggregate(
    df$severity_index2,
    by = list(df$Year.of.Breach),
    FUN = mean
  ),
  mapping = aes(x = Group.1, y = x)
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Average severity index 2') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

severity_index3 over time

# Mean severity_index3 per year of breach, drawn as a line. aggregate()
# names the year column Group.1 and the mean column x.
ggplot(
  data = aggregate(
    df$severity_index3,
    by = list(df$Year.of.Breach),
    FUN = mean
  ),
  mapping = aes(x = Group.1, y = x)
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Average severity index 3') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

severity_index4 over time

# Mean severity_index4 per year of breach, drawn as a line. aggregate()
# names the year column Group.1 and the mean column x.
ggplot(
  data = aggregate(
    df$severity_index4,
    by = list(df$Year.of.Breach),
    FUN = mean
  ),
  mapping = aes(x = Group.1, y = x)
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Average severity index 4') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

severity_index5 over time

# Mean severity_index5 per year of breach, drawn as a line. aggregate()
# names the year column Group.1 and the mean column x.
ggplot(
  data = aggregate(
    df$severity_index5,
    by = list(df$Year.of.Breach),
    FUN = mean
  ),
  mapping = aes(x = Group.1, y = x)
) +
  geom_line() +
  labs(x = 'Year of breach', y = 'Average severity index 5') +
  scale_x_continuous(breaks = seq(from = 2006, to = 2017, by = 2))

all severity indexes over time

# Yearly mean of every severity index. The five columns are selected by name
# rather than by position (previously 26:30), so the result stays correct if
# columns are added, removed or reordered upstream.
mean_index <- aggregate(
  df[, paste0("severity_index", 1:5)],
  list(df$Year.of.Breach),
  mean
)

# Long format: one row per (year, index) pair. Group.1 holds the year,
# `variable` the index name and `value` its yearly mean.
melt_mean_index <- melt(mean_index, id.vars="Group.1")

# One line per severity index
ggplot(melt_mean_index, aes(Group.1, value, col=variable)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2006, 2017, 2)) +
  xlab('Year of breach') +
  ylab('Average severity index')

organization vs. type vs. severity_index1

# Heat map of severity_index1: summarise() drops the last grouping level, so
# the ratio is each breach type's share of the summed index within its
# organization type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index1)) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(mapping = aes(x = Type.of.organization, y = Type.of.breach)) +
  scale_fill_continuous(low = "white", high = "black") +
  geom_tile(mapping = aes(fill = severity_index_indensity)) +
  geom_text(mapping = aes(label = round(severity_index_indensity, digits = 2)))

In consideration of severity_index1: - BSF need to pay more attention to PORT and HACK - BSO/BSR/EDU need to pay more attention to HACK - GOV need to pay more attention to PORT and DISC - MED need to pay more attention to PHYS and HACK - NGO need to pay more attention to PORT and HACK

organization vs. type vs. severity_index2

# Heat map of severity_index2: summarise() drops the last grouping level, so
# the ratio is each breach type's share of the summed index within its
# organization type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index2)) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(mapping = aes(x = Type.of.organization, y = Type.of.breach)) +
  scale_fill_continuous(low = "white", high = "black") +
  geom_tile(mapping = aes(fill = severity_index_indensity)) +
  geom_text(mapping = aes(label = round(severity_index_indensity, digits = 2)))

In terms of severity_index2: - BSF need to pay more attention to PORT and HACK - BSO/BSR need to pay more attention to HACK - EDU need to pay more attention to HACK and DISC (diff) - GOV need to pay more attention to PORT and DISC - MED need to pay more attention to PHYS (diff) - NGO need to pay more attention to PORT and HACK

organization vs. type vs. severity_index3

# Heat map of severity_index3: summarise() drops the last grouping level, so
# the ratio is each breach type's share of the summed index within its
# organization type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index3)) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(mapping = aes(x = Type.of.organization, y = Type.of.breach)) +
  scale_fill_continuous(low = "white", high = "black") +
  geom_tile(mapping = aes(fill = severity_index_indensity)) +
  geom_text(mapping = aes(label = round(severity_index_indensity, digits = 2)))

In terms of severity_index3: - BSF need to pay more attention to PORT and HACK - BSO/BSR need to pay more attention to HACK - EDU need to pay more attention to HACK and DISC - GOV need to pay more attention to PORT and DISC - MED need to pay more attention to PHYS - NGO need to pay more attention to PORT and HACK

organization vs. type vs. severity_index4

# Heat map of severity_index4: summarise() drops the last grouping level, so
# the ratio is each breach type's share of the summed index within its
# organization type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index4)) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(mapping = aes(x = Type.of.organization, y = Type.of.breach)) +
  scale_fill_continuous(low = "white", high = "black") +
  geom_tile(mapping = aes(fill = severity_index_indensity)) +
  geom_text(mapping = aes(label = round(severity_index_indensity, digits = 2)))

organization vs. type vs. severity_index5

# Heat map of severity_index5: summarise() drops the last grouping level, so
# the ratio is each breach type's share of the summed index within its
# organization type
df %>%
  group_by(Type.of.organization, Type.of.breach) %>%
  summarise(severity_index = sum(severity_index5)) %>%
  mutate(severity_index_indensity = severity_index / sum(severity_index)) %>%
  ggplot(mapping = aes(x = Type.of.organization, y = Type.of.breach)) +
  scale_fill_continuous(low = "white", high = "black") +
  geom_tile(mapping = aes(fill = severity_index_indensity)) +
  geom_text(mapping = aes(label = round(severity_index_indensity, digits = 2)))

State vs. severity_index1

# Mean severity_index1 for each state
state_index <- summarise(group_by(df, State), mean = mean(severity_index1))
# States listed from the highest to the lowest mean
dplyr::arrange(state_index, dplyr::desc(mean))
## # A tibble: 50 x 2
##    State          mean
##    <fct>         <dbl>
##  1 North Dakota   18  
##  2 Delaware       17.9
##  3 South Dakota   17.5
##  4 Nebraska       17.2
##  5 Nevada         17.1
##  6 Vermont        17.0
##  7 Idaho          17.0
##  8 New Hampshire  17.0
##  9 Arizona        16.8
## 10 Oklahoma       16.8
## # … with 40 more rows

Type.of.breach vs. severity_index1

# Box plot of severity_index1 grouped by breach type
ggplot(df, aes(Type.of.breach, severity_index1)) +
  geom_boxplot()

breach_type_gB vs. severity_index1

# Box plot of severity_index1 grouped by group-B breach category
ggplot(df, aes(breach_type_gB, severity_index1)) +
  geom_boxplot()

breach_type_gC vs. severity_index1

# Box plot of severity_index1 grouped by group-C breach category
ggplot(df, aes(breach_type_gC, severity_index1)) +
  geom_boxplot()

Type.of.organization vs. severity_index1

# Box plot of severity_index1 grouped by organization type
ggplot(df, aes(Type.of.organization, severity_index1)) +
  geom_boxplot()

org_group vs. severity_index1

# Box plot of severity_index1 grouped by organization group
ggplot(df, aes(org_group, severity_index1)) +
  geom_boxplot()

region_B vs. severity_index1

# Box plot of severity_index1 grouped by region (region_B)
ggplot(df, aes(region_B, severity_index1)) +
  geom_boxplot()

age_of_Law vs. severity_index1

# Scatter plot of severity_index1 against age_of_Law; points are
# semi-transparent so overlapping observations show up darker
ggplot(aes(x = age_of_Law, y = severity_index1), data = df) +
  geom_point(alpha = 0.5)

# Correlation test between age_of_Law and severity_index1. This previously
# tested severity_index2 (a copy-paste slip), so it did not match the plot
# above and simply duplicated the severity_index2 section's test.
cor.test(df$age_of_Law, df$severity_index1)
## 
##  Pearson's product-moment correlation
## 
## data:  df$age_of_Law and df$severity_index2
## t = -2.8481, df = 5353, p-value = 0.004414
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06561495 -0.01212663
## sample estimates:
##         cor 
## -0.03889865

State vs. severity_index2

# Mean severity_index2 for each state
state_index <- summarise(group_by(df, State), mean = mean(severity_index2))
# States listed from the highest to the lowest mean
dplyr::arrange(state_index, dplyr::desc(mean))
## # A tibble: 50 x 2
##    State         mean
##    <fct>        <dbl>
##  1 North Dakota  19.6
##  2 Kentucky      19.1
##  3 South Dakota  18.9
##  4 Tennessee     18.8
##  5 Missouri      18.8
##  6 Montana       18.7
##  7 Maryland      18.7
##  8 Alabama       18.7
##  9 Florida       18.7
## 10 Indiana       18.6
## # … with 40 more rows

Type.of.breach vs. severity_index2

# Box plot of severity_index2 grouped by breach type
ggplot(df, aes(Type.of.breach, severity_index2)) +
  geom_boxplot()

breach_type_gB vs. severity_index2

# Box plot of severity_index2 grouped by group-B breach category
ggplot(df, aes(breach_type_gB, severity_index2)) +
  geom_boxplot()

breach_type_gC vs. severity_index2

# Box plot of severity_index2 grouped by group-C breach category
ggplot(df, aes(breach_type_gC, severity_index2)) +
  geom_boxplot()

Type.of.organization vs. severity_index2

# Box plot of severity_index2 grouped by organization type
ggplot(df, aes(Type.of.organization, severity_index2)) +
  geom_boxplot()

org_group vs. severity_index2

# Box plot of severity_index2 grouped by organization group
ggplot(df, aes(org_group, severity_index2)) +
  geom_boxplot()

region_B vs. severity_index2

# Box plot of severity_index2 grouped by region (region_B)
ggplot(df, aes(region_B, severity_index2)) +
  geom_boxplot()

age_of_Law vs. severity_index2

# Scatter plot of severity_index2 against age_of_Law; points are
# semi-transparent so overlapping observations show up darker
ggplot(df, aes(age_of_Law, severity_index2)) +
  geom_point(alpha = 0.5)

# Correlation test between age_of_Law and severity_index2
cor.test(x = df$age_of_Law, y = df$severity_index2)
## 
##  Pearson's product-moment correlation
## 
## data:  df$age_of_Law and df$severity_index2
## t = -2.8481, df = 5353, p-value = 0.004414
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06561495 -0.01212663
## sample estimates:
##         cor 
## -0.03889865

State vs. severity_index3

# Mean severity_index3 for each state
state_index <- summarise(group_by(df, State), mean = mean(severity_index3))
# States listed from the highest to the lowest mean
dplyr::arrange(state_index, dplyr::desc(mean))
## # A tibble: 50 x 2
##    State         mean
##    <fct>        <dbl>
##  1 North Dakota  27.6
##  2 South Dakota  26.5
##  3 Nebraska      25.8
##  4 Delaware      25.5
##  5 Nevada        25.5
##  6 Maryland      25.3
##  7 Arizona       25.2
##  8 Kentucky      25.0
##  9 Oklahoma      25.0
## 10 Texas         25.0
## # … with 40 more rows

Type.of.breach vs. severity_index3

# Box plot of severity_index3 grouped by breach type
ggplot(df, aes(Type.of.breach, severity_index3)) +
  geom_boxplot()

breach_type_gB vs. severity_index3

# Box plot of severity_index3 grouped by group-B breach category
ggplot(df, aes(breach_type_gB, severity_index3)) +
  geom_boxplot()

breach_type_gC vs. severity_index3

# Box plot of severity_index3 grouped by group-C breach category
ggplot(df, aes(breach_type_gC, severity_index3)) +
  geom_boxplot()

Type.of.organization vs. severity_index3

# Box plot of severity_index3 grouped by organization type
ggplot(df, aes(Type.of.organization, severity_index3)) +
  geom_boxplot()

org_group vs. severity_index3

# Box plot of severity_index3 grouped by organization group
ggplot(df, aes(org_group, severity_index3)) +
  geom_boxplot()

region_B vs. severity_index3

# Box plot of severity_index3 grouped by region (region_B)
ggplot(df, aes(region_B, severity_index3)) +
  geom_boxplot()

age_of_Law vs. severity_index3

# Scatter plot of severity_index3 against age_of_Law; points are
# semi-transparent so overlapping observations show up darker
ggplot(df, aes(age_of_Law, severity_index3)) +
  geom_point(alpha = 0.5)

# Correlation test between age_of_Law and severity_index3
cor.test(x = df$age_of_Law, y = df$severity_index3)
## 
##  Pearson's product-moment correlation
## 
## data:  df$age_of_Law and df$severity_index3
## t = -0.78307, df = 5353, p-value = 0.4336
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03747623  0.01608696
## sample estimates:
##         cor 
## -0.01070231

State vs. severity_index4

# Mean severity_index4 for each state
state_index <- summarise(group_by(df, State), mean = mean(severity_index4))
# States listed from the highest to the lowest mean
dplyr::arrange(state_index, dplyr::desc(mean))
## # A tibble: 50 x 2
##    State         mean
##    <fct>        <dbl>
##  1 North Dakota  9.19
##  2 South Dakota  8.82
##  3 Nebraska      8.59
##  4 Delaware      8.51
##  5 Nevada        8.51
##  6 Maryland      8.43
##  7 Arizona       8.40
##  8 Kentucky      8.32
##  9 Oklahoma      8.32
## 10 Texas         8.32
## # … with 40 more rows

Type.of.breach vs. severity_index4

# Box plot of severity_index4 grouped by breach type
ggplot(df, aes(Type.of.breach, severity_index4)) +
  geom_boxplot()

breach_type_gB vs. severity_index4

# Box plot of severity_index4 grouped by group-B breach category
ggplot(df, aes(breach_type_gB, severity_index4)) +
  geom_boxplot()

breach_type_gC vs. severity_index4

# Box plot of severity_index4 grouped by group-C breach category
ggplot(df, aes(breach_type_gC, severity_index4)) +
  geom_boxplot()

Type.of.organization vs. severity_index4

# Box plot of severity_index4 grouped by organization type
ggplot(df, aes(Type.of.organization, severity_index4)) +
  geom_boxplot()

org_group vs. severity_index4

# Box plot of severity_index4 grouped by organization group
ggplot(df, aes(org_group, severity_index4)) +
  geom_boxplot()

region_B vs. severity_index4

# Box plot of severity_index4 grouped by region (region_B)
ggplot(df, aes(region_B, severity_index4)) +
  geom_boxplot()

age_of_Law vs. severity_index4

# Scatter plot of severity_index4 against age_of_Law; points are
# semi-transparent so overlapping observations show up darker
ggplot(df, aes(age_of_Law, severity_index4)) +
  geom_point(alpha = 0.5)

# Correlation test between age_of_Law and severity_index4
cor.test(x = df$age_of_Law, y = df$severity_index4)
## 
##  Pearson's product-moment correlation
## 
## data:  df$age_of_Law and df$severity_index4
## t = -0.78307, df = 5353, p-value = 0.4336
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03747623  0.01608696
## sample estimates:
##         cor 
## -0.01070231

State vs. severity_index5

# Per-State average of severity_index5
state_index <- summarise(
  group_by(df, State),
  mean = mean(severity_index5)
)
# show the per-State means from highest to lowest
arrange(state_index, desc(mean))
## # A tibble: 50 x 2
##    State           mean
##    <fct>          <dbl>
##  1 West Virginia   15.3
##  2 Maine           13.7
##  3 Utah            13.4
##  4 Arkansas        13.4
##  5 Hawaii          13.0
##  6 Alaska          12.6
##  7 Wyoming         12.4
##  8 Vermont         12.2
##  9 North Carolina  12.2
## 10 Iowa            12.0
## # … with 40 more rows

Type.of.breach vs. severity_index5

# Boxplot of severity_index5 grouped by Type.of.breach
ggplot(df, mapping = aes(Type.of.breach, severity_index5)) +
  geom_boxplot()

breach_type_gB vs. severity_index5

# Boxplot of severity_index5 grouped by breach_type_gB
ggplot(df, mapping = aes(breach_type_gB, severity_index5)) +
  geom_boxplot()

breach_type_gC vs. severity_index5

# Boxplot of severity_index5 grouped by breach_type_gC
ggplot(df, mapping = aes(breach_type_gC, severity_index5)) +
  geom_boxplot()

Type.of.organization vs. severity_index5

# Boxplot of severity_index5 grouped by Type.of.organization
ggplot(df, mapping = aes(Type.of.organization, severity_index5)) +
  geom_boxplot()

org_group vs. severity_index5

# Boxplot of severity_index5 grouped by org_group
ggplot(df, mapping = aes(org_group, severity_index5)) +
  geom_boxplot()

region_B vs. severity_index5

# Boxplot of severity_index5 grouped by region_B
ggplot(df, mapping = aes(region_B, severity_index5)) +
  geom_boxplot()

age_of_Law vs. severity_index5

# Scatter plot of severity_index5 (y) against age_of_Law (x);
# points are half-transparent so overlapping observations remain visible
ggplot(df, aes(age_of_Law, severity_index5)) +
  geom_point(alpha = 1 / 2)

# Pearson correlation test between age_of_Law and severity_index5
cor.test(
  x = df$age_of_Law,
  y = df$severity_index5,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  df$age_of_Law and df$severity_index5
## t = -0.29505, df = 5353, p-value = 0.768
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03081402  0.02275443
## sample estimates:
##          cor 
## -0.004032687